In [ ]:
import warnings
warnings.filterwarnings("ignore")
from torchmetrics.classification import BinaryF1Score as F1Score
import matplotlib.pyplot as plt
from anomalib.models import Padim
from anomalib.engine import Engine
from anomalib.data import PredictDataset, MVTec
from anomalib import TaskType
from anomalib.deploy import ExportType, OpenVINOInferencer, CompressionType
from anomalib.utils.visualization.image import ImageVisualizer, VisualizationMode, ImageResult
from anomalib.metrics import AUROC
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast
from PIL import Image
from pynvml.smi import nvidia_smi
import numpy as np
from pathlib import Path
from timeit import default_timer as timer
import re
import sys
import psutil
import shutil
sys.path.append("anomalib_trt_python")
from anomalib_trt_python.trt_inferencer import TrtInferencer
import json
import torch
# Reproducibility / numeric settings
torch.set_float32_matmul_precision('high')
torch.manual_seed(21)
np.random.seed(21)

# Prefer GPU when available; nvsmi is used later to sample VRAM during TRT inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nvsmi = nvidia_smi.getInstance()
print(f"Using device: {device}")
# Guard: get_device_name(0) raises on CPU-only machines, defeating the fallback above
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
task = TaskType.SEGMENTATION
Using device: cuda
NVIDIA GeForce RTX 4070 Laptop GPU
In [ ]:
# Choose a category
categories = ['transistor', 'bottle', 'carpet', 'hazelnut', 'screw']
category = categories[1]  # bottle

# Choose a test folder for inference
specific_folder = "broken_large"
inference_path = Path(f'MVTec_test/{category}/test/{specific_folder}')

# Create the per-category output folder.
# Path.mkdir replaces the scattered `import os` + os.path.exists/os.makedirs
# (Path is already imported at the top of the notebook); exist_ok makes it idempotent.
Path(f"./{category}").mkdir(parents=True, exist_ok=True)
In [ ]:
def _ground_truth_path(img_path):
    """Derive the MVTec ground-truth mask path from a test-image path.

    e.g. '<root>/test/broken_large/000.png' -> '<root>/ground_truth/broken_large/000_mask.png'
    """
    gt = re.sub(r'\btest\b', 'ground_truth', img_path)
    # The dot must be escaped: the original pattern r'\b.png\b' let '.' match
    # any character, so e.g. 'Xpng' after a word boundary would also be rewritten.
    return re.sub(r'\.png\b', '_mask.png', gt)


def test_model(engine, data, inference, model=None):
    """Run inference with the chosen backend, visualize predictions and report metrics.

    Parameters
    ----------
    engine : anomalib ``Engine`` or an inferencer (OpenVINOInferencer / TrtInferencer)
        Object exposing ``predict`` for the selected backend.
    data : list[str] or DataLoader
        Image paths for 'onnx' / 'trt' / 'openvino'; a DataLoader for torch backends.
    inference : str
        One of 'onnx', 'trt', 'torch_fp32', 'torch_fp16', 'openvino'.
    model : optional
        Torch model, required for the 'torch_*' backends.

    Returns
    -------
    tuple
        (output_image_path, total_inference_time_s, max_memory_allocated, output_images)
        Memory units differ per backend: bytes (torch / host RAM) vs MiB (TRT via nvidia-smi).

    Raises
    ------
    ValueError
        On unknown ``inference`` type, missing ``model`` for torch backends,
        or when no predictions were produced.
    """
    img_visualizer = ImageVisualizer(mode=VisualizationMode.FULL, task=task)
    output_images = []
    total_time = 0
    max_memory_allocated = 0  # initialized up front: avoids NameError when `data` is empty
    inferences = ['onnx', 'trt', 'torch_fp32', 'torch_fp16', 'openvino']
    if inference not in inferences:
        raise ValueError(f'Invalid inference type. Choose from {inferences}')
    if inference in ('onnx', 'trt', 'openvino'):
        # `data` is a list of image paths; predict one image at a time.
        for img_path in data:
            time_0 = timer()
            pred_img_result = engine.predict(img_path)
            time_1 = timer()
            if inference != 'trt':
                # CPU backends: sample host RAM currently in use (bytes)
                max_memory_allocated = psutil.virtual_memory().used
            else:
                # TRT runs on the GPU: sample VRAM via nvidia-smi (MiB)
                max_memory_allocated = nvsmi.DeviceQuery('memory.used')
                max_memory_allocated = max_memory_allocated['gpu'][0]['fb_memory_usage']['used']
            total_time += time_1 - time_0
            gt = _ground_truth_path(img_path)
            # Match the ground-truth mask resolution to the prediction resolution
            if pred_img_result.image.shape[:2] == (256, 256):
                gt_img = np.array(Image.open(gt).resize((256, 256)))
            else:
                gt_img = np.array(Image.open(gt))
            pred_img_result.gt_mask = gt_img
            output_images.append((img_visualizer.visualize_image(pred_img_result), pred_img_result))
    else:
        # `data` is a DataLoader
        if model is None:
            raise ValueError('Model is required for torch inference')
        model.eval()
        model.to(device)
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        time_0 = timer()
        # Only the forward pass needs autocast; this removes the duplicated
        # fp16/fp32 timing blocks of the original implementation.
        if inference == 'torch_fp16':
            with autocast():
                predictions = engine.predict(model, data)
        else:
            predictions = engine.predict(model, data)
        time_1 = timer()
        max_memory_allocated = torch.cuda.max_memory_allocated(device)
        total_time = time_1 - time_0
        for pred in predictions:
            image_path = pred["image_path"][0]
            image_size = pred["image"].shape[-2:]
            image = np.array(Image.open(image_path).resize(image_size))
            gt = _ground_truth_path(image_path)
            # Assemble an ImageResult from whatever fields the prediction carries
            image_result = ImageResult(
                image=image,
                pred_score=pred["pred_scores"][0].cpu().numpy().item() if "pred_scores" in pred else None,
                pred_label=pred["pred_labels"][0].cpu().numpy().item() if "pred_labels" in pred else None,
                anomaly_map=pred["anomaly_maps"][0].cpu().numpy() if "anomaly_maps" in pred else None,
                pred_mask=pred["pred_masks"][0].squeeze().int().cpu().numpy() if "pred_masks" in pred else None,
                gt_mask=pred["mask"][0].squeeze().int().cpu().numpy() if "mask" in pred else None,
                gt_boxes=pred["boxes"][0].cpu().numpy() if "boxes" in pred else None,
                pred_boxes=pred["pred_boxes"][0].cpu().numpy() if "pred_boxes" in pred else None,
                box_labels=pred["box_labels"][0].cpu().numpy() if "box_labels" in pred else None,
            )
            # Overwrite gt_mask with the image-resolution ground truth from disk
            image_result.gt_mask = np.array(Image.open(gt).resize(image_size))
            output_images.append((img_visualizer.visualize_image(image_result), image_result))

    if not output_images:
        raise ValueError('No predictions were produced; check the input data')

    sum_f1_pixel = 0
    sum_f1_img = 0
    sum_auroc_pixel = 0
    sum_auroc_img = 0
    fig, ax = plt.subplots(len(output_images), 1, figsize=(20, 20))
    for i, (img, pred_img_result) in enumerate(output_images):
        # plt.subplots returns a bare Axes (not an array) when there is one row
        panel = ax[i] if len(output_images) > 1 else ax
        panel.imshow(img)
        panel.axis('off')

        auroc = AUROC().to(device)
        pred_mask = torch.from_numpy(pred_img_result.pred_mask).to(device)
        gt_mask = torch.from_numpy(pred_img_result.gt_mask).to(device)
        # Per-row mean of the predicted mask serves as the "image-level" score here
        image_scores = torch.mean(pred_mask.view(pred_mask.shape[0], -1).float(), dim=1)
        gt_image_labels = torch.any(gt_mask.view(gt_mask.shape[0], -1) > 0, dim=1).long()
        img_lvl_auroc = auroc(image_scores, gt_image_labels)

        # Binarize the ground-truth mask (original values may be 0/255)
        gt_mask = torch.where(gt_mask > 0, torch.tensor(1, device=gt_mask.device), torch.tensor(0, device=gt_mask.device))
        pred_mask_flat = pred_mask.view(-1).float()
        gt_mask_flat = gt_mask.view(-1).long()
        pixel_lvl_auroc = auroc(pred_mask_flat, gt_mask_flat)

        f1_score = F1Score().to(device)
        pixel_lvl_f1 = f1_score(pred_mask_flat, gt_mask_flat)
        img_lvl_f1 = f1_score(image_scores, gt_image_labels)
        sum_f1_pixel += pixel_lvl_f1
        sum_f1_img += img_lvl_f1
        sum_auroc_pixel += pixel_lvl_auroc
        sum_auroc_img += img_lvl_auroc
    print(f'Average pixel-level F1 score: {sum_f1_pixel / len(output_images)}')
    print(f'Average image-level F1 score: {sum_f1_img / len(output_images)}')
    print(f'Average pixel-level AUROC: {sum_auroc_pixel / len(output_images)}')
    print(f'Average image-level AUROC: {sum_auroc_img / len(output_images)}')
    output_path = Path(f'./{category}/output_{inference}.png')
    fig.tight_layout()
    plt.savefig(output_path, bbox_inches='tight', transparent=True, pad_inches=0)
    return output_path, total_time, max_memory_allocated, output_images
In [ ]:
# Torch FP32 model: fit PaDiM once per category, caching the weights on disk
model_fp32 = Padim(backbone='wide_resnet50_2')
engine_fp32 = Engine()

# Set to True to retrain; when False and cached weights exist, they are loaded instead
fit = True

weights_fp32_path = Path(f"./{category}/padim_fp32_torch.pth")
if weights_fp32_path.exists() and not fit:
    # Reuse previously fitted weights
    model_fp32.load_state_dict(torch.load(weights_fp32_path))
    print("Loaded model from disk")
else:
    # Fit on the MVTec training split for this category
    datamodule_fp32 = MVTec(
        root=Path("./MVTec_fit"),
        category=category,
        image_size=256,
        train_batch_size=32,
        eval_batch_size=32,
        num_workers=4,
        task=task,
    )
    engine_fp32.fit(datamodule=datamodule_fp32, model=model_fp32)
    print("Model trained")
    # Cache the fitted FP32 weights for later cells (FP16 conversion, exports)
    torch.save(model_fp32.state_dict(), weights_fp32_path)
    print("Model saved to disk")
Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..
F1Score class exists for backwards compatibility. It will be removed in v1.1. Please use BinaryF1Score from torchmetrics instead
F1Score class exists for backwards compatibility. It will be removed in v1.1. Please use BinaryF1Score from torchmetrics instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
┏━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓
┃   ┃ Name                  ┃ Type                     ┃ Params ┃ Mode  ┃
┡━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩
│ 0 │ model                 │ PadimModel               │ 24.9 M │ train │
│ 1 │ _transform            │ Compose                  │      0 │ train │
│ 2 │ normalization_metrics │ MinMax                   │      0 │ train │
│ 3 │ image_threshold       │ F1AdaptiveThreshold      │      0 │ train │
│ 4 │ pixel_threshold       │ F1AdaptiveThreshold      │      0 │ train │
│ 5 │ image_metrics         │ AnomalibMetricCollection │      0 │ train │
│ 6 │ pixel_metrics         │ AnomalibMetricCollection │      0 │ train │
└───┴───────────────────────┴──────────────────────────┴────────┴───────┘
Trainable params: 24.9 M                                                                                           
Non-trainable params: 0                                                                                            
Total params: 24.9 M                                                                                               
Total estimated model params size (MB): 99                                                                         
Output()
`Trainer.fit` stopped: `max_epochs=1` reached.


Model trained
Model saved to disk
In [ ]:
# Inference with the Torch FP32 model
inference_dataset_fp32 = PredictDataset(path=inference_path)
inference_dataloader_fp32 = DataLoader(dataset=inference_dataset_fp32)

results_fp32 = test_model(engine_fp32, inference_dataloader_fp32, 'torch_fp32', model=model_fp32)
output_path_FP32, inference_time_FP32, memory_FP32, fp32_output_images = results_fp32
print(f"Time taken for inference (FP32): {inference_time_FP32:.2f} seconds, memory used: {memory_FP32 / 1024**3:.2f} GB")
# Display the visualization produced by test_model
Image.open(output_path_FP32)
ckpt_path is not provided. Model weights will not be loaded.
F1Score class exists for backwards compatibility. It will be removed in v1.1. Please use BinaryF1Score from torchmetrics instead
F1Score class exists for backwards compatibility. It will be removed in v1.1. Please use BinaryF1Score from torchmetrics instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Output()


Average pixel-level F1 score: 0.7377685904502869
Average image-level F1 score: 0.9502601027488708
Average pixel-level AUROC: 0.9307771921157837
Average image-level AUROC: 0.9621909260749817
Time taken for inference (FP32): 3.24 seconds, memory used: 4.83 GB
Out[ ]:
No description has been provided for this image
In [ ]:
# Torch FP16 model
#
# When cached FP16 weights exist (and fit is False), load them; otherwise
# convert the trained FP32 weights to FP16 and cache the result.
if Path(f"./{category}/padim_fp16_torch.pth").exists() and not fit:
    model_fp16 = Padim(backbone='wide_resnet50_2')
    model_fp16.load_state_dict(torch.load(f"./{category}/padim_fp16_torch.pth"))
    # BUGFIX: load_state_dict copies the stored FP16 tensors into the module's
    # FP32 parameters (casting them back up), so the module must be converted
    # to half precision explicitly after loading.
    model_fp16 = model_fp16.half()
    print("Loaded model from disk")
else:
    model_fp32_to_16 = Padim(backbone='wide_resnet50_2')
    model_fp32_to_16.load_state_dict(torch.load(f"./{category}/padim_fp32_torch.pth"))
    model_fp16 = model_fp32_to_16.half()
    # Save first, then report (the original printed "saved" before actually saving)
    torch.save(model_fp16.state_dict(), f"./{category}/padim_fp16_torch.pth")
    print("Converted the FP32 model and saved it to disk")
Converted the FP32 model and saved it to disk
In [ ]:
engine_fp16 = Engine()

# Inference with the Torch FP16 model
inference_dataset_fp16 = PredictDataset(path=inference_path)
inference_dataloader_fp16 = DataLoader(dataset=inference_dataset_fp16)

results_fp16 = test_model(engine_fp16, inference_dataloader_fp16, 'torch_fp16', model_fp16)
output_path_fp16, inference_time_fp16, memory_fp16, fp_16_output_images = results_fp16
print(f"Time taken for inference: {inference_time_fp16:.2f} seconds, memory used: {memory_fp16 / 1024**3:.2f} GB")
# Display the visualization produced by test_model
Image.open(output_path_fp16)
Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.rich_model_summary.RichModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..
ckpt_path is not provided. Model weights will not be loaded.
F1Score class exists for backwards compatibility. It will be removed in v1.1. Please use BinaryF1Score from torchmetrics instead
F1Score class exists for backwards compatibility. It will be removed in v1.1. Please use BinaryF1Score from torchmetrics instead
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Output()


Average pixel-level F1 score: 0.7377628684043884
Average image-level F1 score: 0.9502601027488708
Average pixel-level AUROC: 0.9307877421379089
Average image-level AUROC: 0.9621909260749817
Time taken for inference: 1.86 seconds, memory used: 2.40 GB
Out[ ]:
No description has been provided for this image
In [ ]:
# ONNX FP32 model
#
# Export the trained Torch FP32 model to ONNX, unless an export
# already exists on disk and fit is False.
onnx_model_file = Path(f"./{category}/padim_fp32_onnx/weights/onnx/model.onnx")
if onnx_model_file.exists() and not fit:
    print("ONNX model already exists")
else:
    model_fp32.eval()
    # Remove any stale export directory first to avoid errors
    onnx_export_root = Path(f"./{category}/padim_fp32_onnx")
    if onnx_export_root.exists():
        shutil.rmtree(onnx_export_root)
    engine_fp32.export(model=model_fp32, export_type=ExportType.ONNX, export_root=f'./{category}/padim_fp32_onnx', input_size=(256, 256))
    print("ONNX model exported and saved to disk")
ONNX model exported and saved to disk
In [ ]:
# Inference with the ONNX model (executed through the OpenVINO runtime)
inferencer_onnx = OpenVINOInferencer(
    path=f"./{category}/padim_fp32_onnx/weights/onnx/model.onnx",
    metadata=f"./{category}/padim_fp32_onnx/weights/onnx/metadata.json",
    task=task,
)
data = [str(img) for img in inference_path.glob('*.png')]
results_onnx = test_model(inferencer_onnx, data, 'onnx')
output_path_onnx, inference_time_onnx, memory_onnx, onnx_output_images = results_onnx
print(f"Inference time: {inference_time_onnx:.2f} seconds, Memory: {memory_onnx/1024**3:.2f} GB")
# Display the visualization produced by test_model
Image.open(output_path_onnx)
Average pixel-level F1 score: 0.7181596159934998
Average image-level F1 score: 0.9381791353225708
Average pixel-level AUROC: 0.9275180101394653
Average image-level AUROC: 0.9510656595230103
Inference time: 0.58 seconds, Memory: 23.22 GB
Out[ ]:
No description has been provided for this image
In [ ]:
# OpenVINO FP16 model
#
# Export the trained Torch FP32 model to OpenVINO IR with FP16 compression,
# unless an export already exists on disk and fit is False.
openvino_model_file = Path(f"./{category}/padim_fp16_openvino/weights/openvino/model.xml")
if openvino_model_file.exists() and not fit:
    print("OpenVINO model already exists")
else:
    model_fp32.eval()
    # Remove any stale export directory first to avoid errors
    openvino_export_root = Path(f"./{category}/padim_fp16_openvino")
    if openvino_export_root.exists():
        shutil.rmtree(openvino_export_root)
    engine_fp32.export(model=model_fp32, export_type=ExportType.OPENVINO, export_root=f'./{category}/padim_fp16_openvino', compression_type=CompressionType.FP16, input_size=(256, 256))
    print("OpenVINO model exported and saved to disk")
OpenVINO model exported and saved to disk
In [ ]:
# Inference with the OpenVINO FP16 model
inferencer_openvino = OpenVINOInferencer(
    path=f"./{category}/padim_fp16_openvino/weights/openvino/model.xml",
    metadata=f"./{category}/padim_fp16_openvino/weights/openvino/metadata.json",
    task=task,
)
data = [str(img) for img in inference_path.glob('*.png')]
results_openvino = test_model(inferencer_openvino, data, 'openvino')
output_path_openvino, inference_time_openvino, memory_openvino, openvino_output_images = results_openvino
print(f"Inference time: {inference_time_openvino:.2f} seconds, Memory: {memory_openvino / 1024**3:.2f} GB")
# Display the visualization produced by test_model
Image.open(output_path_openvino)
Average pixel-level F1 score: 0.7168790698051453
Average image-level F1 score: 0.9381791353225708
Average pixel-level AUROC: 0.9271915555000305
Average image-level AUROC: 0.9503898620605469
Inference time: 0.49 seconds, Memory: 19.75 GB
Out[ ]:
No description has been provided for this image
In [ ]:
# From ONNX to TensorRT
# If the TensorRT engine already exists and fit is False, load it from disk
if Path(f"./{category}/padim_fp32_trt.engine").exists() and not fit:
    print("TensorRT engine already exists")
# Convert the ONNX model to TensorRT and save it to disk if it doesn't exist or fit is True
else:
    onnx_path = f"./{category}/padim_fp32_onnx/weights/onnx/model.onnx"
    engine_path = f"./{category}/padim_fp32_trt.engine"
    # Shell out to trtexec (must be on PATH); builds an FP32 engine — the
    # build step can take several minutes (see the timing in the log below).
    # NOTE(review): the success message prints even if trtexec fails —
    # consider capturing and checking the command's exit status.
    !trtexec --onnx={onnx_path} --saveEngine={engine_path}
    print("TensorRT engine created and saved to disk")
TensorRT engine created and saved to disk&&&& RUNNING TensorRT.trtexec [TensorRT v100100] # trtexec --onnx=./bottle/padim_fp32_onnx/weights/onnx/model.onnx --saveEngine=./bottle/padim_fp32_trt.engine
[07/08/2024-15:31:13] [I] === Model Options ===
[07/08/2024-15:31:13] [I] Format: ONNX
[07/08/2024-15:31:13] [I] Model: ./bottle/padim_fp32_onnx/weights/onnx/model.onnx
[07/08/2024-15:31:13] [I] Output:
[07/08/2024-15:31:13] [I] === Build Options ===
[07/08/2024-15:31:13] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default, tacticSharedMem: default
[07/08/2024-15:31:13] [I] avgTiming: 8
[07/08/2024-15:31:13] [I] Precision: FP32
[07/08/2024-15:31:13] [I] LayerPrecisions: 
[07/08/2024-15:31:13] [I] Layer Device Types: 
[07/08/2024-15:31:13] [I] Calibration: 
[07/08/2024-15:31:13] [I] Refit: Disabled
[07/08/2024-15:31:13] [I] Strip weights: Disabled
[07/08/2024-15:31:13] [I] Version Compatible: Disabled
[07/08/2024-15:31:13] [I] ONNX Plugin InstanceNorm: Disabled
[07/08/2024-15:31:13] [I] TensorRT runtime: full
[07/08/2024-15:31:13] [I] Lean DLL Path: 
[07/08/2024-15:31:13] [I] Tempfile Controls: { in_memory: allow, temporary: allow }
[07/08/2024-15:31:13] [I] Exclude Lean Runtime: Disabled
[07/08/2024-15:31:13] [I] Sparsity: Disabled
[07/08/2024-15:31:13] [I] Safe mode: Disabled
[07/08/2024-15:31:13] [I] Build DLA standalone loadable: Disabled
[07/08/2024-15:31:13] [I] Allow GPU fallback for DLA: Disabled
[07/08/2024-15:31:13] [I] DirectIO mode: Disabled
[07/08/2024-15:31:13] [I] Restricted mode: Disabled
[07/08/2024-15:31:13] [I] Skip inference: Disabled
[07/08/2024-15:31:13] [I] Save engine: ./bottle/padim_fp32_trt.engine
[07/08/2024-15:31:13] [I] Load engine: 
[07/08/2024-15:31:13] [I] Profiling verbosity: 0
[07/08/2024-15:31:13] [I] Tactic sources: Using default tactic sources
[07/08/2024-15:31:13] [I] timingCacheMode: local
[07/08/2024-15:31:13] [I] timingCacheFile: 
[07/08/2024-15:31:13] [I] Enable Compilation Cache: Enabled
[07/08/2024-15:31:13] [I] errorOnTimingCacheMiss: Disabled
[07/08/2024-15:31:13] [I] Preview Features: Use default preview flags.
[07/08/2024-15:31:13] [I] MaxAuxStreams: -1
[07/08/2024-15:31:13] [I] BuilderOptimizationLevel: -1
[07/08/2024-15:31:13] [I] Calibration Profile Index: 0
[07/08/2024-15:31:13] [I] Weight Streaming: Disabled
[07/08/2024-15:31:13] [I] Debug Tensors: 
[07/08/2024-15:31:13] [I] Input(s)s format: fp32:CHW
[07/08/2024-15:31:13] [I] Output(s)s format: fp32:CHW
[07/08/2024-15:31:13] [I] Input build shapes: model
[07/08/2024-15:31:13] [I] Input calibration shapes: model
[07/08/2024-15:31:13] [I] === System Options ===
[07/08/2024-15:31:13] [I] Device: 0
[07/08/2024-15:31:13] [I] DLACore: 
[07/08/2024-15:31:13] [I] Plugins:
[07/08/2024-15:31:13] [I] setPluginsToSerialize:
[07/08/2024-15:31:13] [I] dynamicPlugins:
[07/08/2024-15:31:13] [I] ignoreParsedPluginLibs: 0
[07/08/2024-15:31:13] [I] 
[07/08/2024-15:31:13] [I] === Inference Options ===
[07/08/2024-15:31:13] [I] Batch: Explicit
[07/08/2024-15:31:13] [I] Input inference shapes: model
[07/08/2024-15:31:13] [I] Iterations: 10
[07/08/2024-15:31:13] [I] Duration: 3s (+ 200ms warm up)
[07/08/2024-15:31:13] [I] Sleep time: 0ms
[07/08/2024-15:31:13] [I] Idle time: 0ms
[07/08/2024-15:31:13] [I] Inference Streams: 1
[07/08/2024-15:31:13] [I] ExposeDMA: Disabled
[07/08/2024-15:31:13] [I] Data transfers: Enabled
[07/08/2024-15:31:13] [I] Spin-wait: Disabled
[07/08/2024-15:31:13] [I] Multithreading: Disabled
[07/08/2024-15:31:13] [I] CUDA Graph: Disabled
[07/08/2024-15:31:13] [I] Separate profiling: Disabled
[07/08/2024-15:31:13] [I] Time Deserialize: Disabled
[07/08/2024-15:31:13] [I] Time Refit: Disabled
[07/08/2024-15:31:13] [I] NVTX verbosity: 0
[07/08/2024-15:31:13] [I] Persistent Cache Ratio: 0
[07/08/2024-15:31:13] [I] Optimization Profile Index: 0
[07/08/2024-15:31:13] [I] Weight Streaming Budget: 100.000000%
[07/08/2024-15:31:13] [I] Inputs:
[07/08/2024-15:31:13] [I] Debug Tensor Save Destinations:
[07/08/2024-15:31:13] [I] === Reporting Options ===
[07/08/2024-15:31:13] [I] Verbose: Disabled
[07/08/2024-15:31:13] [I] Averages: 10 inferences
[07/08/2024-15:31:13] [I] Percentiles: 90,95,99
[07/08/2024-15:31:13] [I] Dump refittable layers:Disabled
[07/08/2024-15:31:13] [I] Dump output: Disabled
[07/08/2024-15:31:13] [I] Profile: Disabled
[07/08/2024-15:31:13] [I] Export timing to JSON file: 
[07/08/2024-15:31:13] [I] Export output to JSON file: 
[07/08/2024-15:31:13] [I] Export profile to JSON file: 
[07/08/2024-15:31:13] [I] 
[07/08/2024-15:31:13] [I] === Device Information ===
[07/08/2024-15:31:13] [I] Available Devices: 
[07/08/2024-15:31:13] [I]   Device 0: "NVIDIA GeForce RTX 4070 Laptop GPU" UUID: GPU-e58646d2-c4c6-37c2-ebcb-9018ebe88a47
[07/08/2024-15:31:13] [I] Selected Device: NVIDIA GeForce RTX 4070 Laptop GPU
[07/08/2024-15:31:13] [I] Selected Device ID: 0
[07/08/2024-15:31:13] [I] Selected Device UUID: GPU-e58646d2-c4c6-37c2-ebcb-9018ebe88a47
[07/08/2024-15:31:13] [I] Compute Capability: 8.9
[07/08/2024-15:31:13] [I] SMs: 36
[07/08/2024-15:31:13] [I] Device Global Memory: 8187 MiB
[07/08/2024-15:31:13] [I] Shared Memory per SM: 100 KiB
[07/08/2024-15:31:13] [I] Memory Bus Width: 128 bits (ECC disabled)
[07/08/2024-15:31:13] [I] Application Compute Clock Rate: 1.695 GHz
[07/08/2024-15:31:13] [I] Application Memory Clock Rate: 8.001 GHz
[07/08/2024-15:31:13] [I] 
[07/08/2024-15:31:13] [I] Note: The application clock rates do not reflect the actual clock rates that the GPU is currently running at.
[07/08/2024-15:31:13] [I] 
[07/08/2024-15:31:13] [I] TensorRT version: 10.1.0
[07/08/2024-15:31:13] [I] Loading standard plugins
[07/08/2024-15:31:13] [I] [TRT] [MemUsageChange] Init CUDA: CPU +80, GPU +0, now: CPU 20737, GPU 1131 (MiB)
[07/08/2024-15:31:21] [I] [TRT] [MemUsageChange] Init builder kernel library: CPU +2402, GPU +290, now: CPU 23507, GPU 1421 (MiB)
[07/08/2024-15:31:21] [I] Start parsing network model.
[07/08/2024-15:31:21] [I] [TRT] ----------------------------------------------------------------
[07/08/2024-15:31:21] [I] [TRT] Input filename:   ./bottle/padim_fp32_onnx/weights/onnx/model.onnx
[07/08/2024-15:31:21] [I] [TRT] ONNX IR version:  0.0.7
[07/08/2024-15:31:21] [I] [TRT] Opset version:    14
[07/08/2024-15:31:21] [I] [TRT] Producer name:    pytorch
[07/08/2024-15:31:21] [I] [TRT] Producer version: 2.3.1
[07/08/2024-15:31:21] [I] [TRT] Domain:           
[07/08/2024-15:31:21] [I] [TRT] Model version:    0
[07/08/2024-15:31:21] [I] [TRT] Doc string:       
[07/08/2024-15:31:21] [I] [TRT] ----------------------------------------------------------------
[07/08/2024-15:31:28] [I] Finished parsing network model. Parse time: 7.17457
[07/08/2024-15:31:28] [I] [TRT] Local timing cache in use. Profiling results in this builder pass will not be stored.
[07/08/2024-15:37:46] [I] [TRT] Detected 1 inputs and 1 output network tensors.
[07/08/2024-15:37:47] [I] [TRT] Total Host Persistent Memory: 270960
[07/08/2024-15:37:47] [I] [TRT] Total Device Persistent Memory: 2048
[07/08/2024-15:37:47] [I] [TRT] Total Scratch Memory: 18038784
[07/08/2024-15:37:47] [I] [TRT] [BlockAssignment] Started assigning block shifts. This will take 81 steps to complete.
[07/08/2024-15:37:47] [I] [TRT] [BlockAssignment] Algorithm ShiftNTopDown took 1.4107ms to assign 7 blocks to 81 nodes requiring 48448512 bytes.
[07/08/2024-15:37:47] [I] [TRT] Total Activation Memory: 48447488
[07/08/2024-15:37:48] [I] [TRT] Total Weights Memory: 5106764292
[07/08/2024-15:37:48] [I] [TRT] Engine generation completed in 379.539 seconds.
[07/08/2024-15:37:48] [I] [TRT] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 16 MiB, GPU 4871 MiB
[07/08/2024-15:37:49] [I] [TRT] [MemUsageStats] Peak memory usage during Engine building and serialization: CPU: 17036 MiB
[07/08/2024-15:37:49] [I] Engine built in 381.02 sec.
[07/08/2024-15:37:49] [I] Created engine with size: 4873.53 MiB
[07/08/2024-15:38:03] [I] [TRT] Loaded engine size: 4873 MiB
[07/08/2024-15:38:04] [I] Engine deserialized in 5.079 sec.
[07/08/2024-15:38:04] [I] [TRT] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +46, now: CPU 0, GPU 4916 (MiB)
[07/08/2024-15:38:04] [I] Setting persistentCacheLimit to 0 bytes.
[07/08/2024-15:38:04] [I] Created execution context with device memory size: 46.2031 MiB
[07/08/2024-15:38:04] [I] Using random values for input input
[07/08/2024-15:38:04] [I] Input binding for input with dimensions 1x3x256x256 is created.
[07/08/2024-15:38:04] [I] Output binding for output with dimensions 1x1x256x256 is created.
[07/08/2024-15:38:04] [I] Starting inference
[07/08/2024-15:38:07] [I] Warmup completed 8 queries over 200 ms
[07/08/2024-15:38:07] [I] Timing trace has 122 queries over 3.06595 s
[07/08/2024-15:38:07] [I] 
[07/08/2024-15:38:07] [I] === Trace details ===
[07/08/2024-15:38:07] [I] Trace averages of 10 runs:
[07/08/2024-15:38:07] [I] Average on 10 runs - GPU latency: 24.924 ms - Host latency: 25.0594 ms (enqueue 0.567786 ms)
[07/08/2024-15:38:07] [I] Average on 10 runs - GPU latency: 24.8289 ms - Host latency: 24.9563 ms (enqueue 0.615607 ms)
[07/08/2024-15:38:07] [I] Average on 10 runs - GPU latency: 24.8722 ms - Host latency: 25.0178 ms (enqueue 0.595221 ms)
[07/08/2024-15:38:07] [I] Average on 10 runs - GPU latency: 25.1238 ms - Host latency: 25.2738 ms (enqueue 0.666321 ms)
[07/08/2024-15:38:07] [I] Average on 10 runs - GPU latency: 24.9231 ms - Host latency: 25.0819 ms (enqueue 0.651514 ms)
[07/08/2024-15:38:07] [I] Average on 10 runs - GPU latency: 25.0108 ms - Host latency: 25.1362 ms (enqueue 0.562244 ms)
[07/08/2024-15:38:07] [I] Average on 10 runs - GPU latency: 24.9383 ms - Host latency: 25.0759 ms (enqueue 0.592993 ms)
[07/08/2024-15:38:07] [I] Average on 10 runs - GPU latency: 25.0519 ms - Host latency: 25.1946 ms (enqueue 0.607483 ms)
[07/08/2024-15:38:07] [I] Average on 10 runs - GPU latency: 24.9184 ms - Host latency: 25.0494 ms (enqueue 0.59812 ms)
[07/08/2024-15:38:07] [I] Average on 10 runs - GPU latency: 24.7885 ms - Host latency: 24.94 ms (enqueue 0.583667 ms)
[07/08/2024-15:38:07] [I] Average on 10 runs - GPU latency: 24.7143 ms - Host latency: 24.8637 ms (enqueue 0.628271 ms)
[07/08/2024-15:38:07] [I] Average on 10 runs - GPU latency: 24.7015 ms - Host latency: 24.8372 ms (enqueue 0.552588 ms)
[07/08/2024-15:38:07] [I] 
[07/08/2024-15:38:07] [I] === Performance summary ===
[07/08/2024-15:38:07] [I] Throughput: 39.7919 qps
[07/08/2024-15:38:07] [I] Latency: min = 24.7388 ms, max = 26.7462 ms, mean = 25.0361 ms, median = 25.0056 ms, percentile(90%) = 25.2325 ms, percentile(95%) = 25.2654 ms, percentile(99%) = 25.3673 ms
[07/08/2024-15:38:07] [I] Enqueue Time: min = 0.34436 ms, max = 0.874878 ms, mean = 0.603683 ms, median = 0.611435 ms, percentile(90%) = 0.713928 ms, percentile(95%) = 0.805542 ms, percentile(99%) = 0.86377 ms
[07/08/2024-15:38:07] [I] H2D Latency: min = 0.0700684 ms, max = 0.195801 ms, mean = 0.0950824 ms, median = 0.0847015 ms, percentile(90%) = 0.127808 ms, percentile(95%) = 0.135132 ms, percentile(99%) = 0.149414 ms
[07/08/2024-15:38:07] [I] GPU Compute Time: min = 24.5698 ms, max = 26.5984 ms, mean = 24.8946 ms, median = 24.8648 ms, percentile(90%) = 25.09 ms, percentile(95%) = 25.1146 ms, percentile(99%) = 25.1975 ms
[07/08/2024-15:38:07] [I] D2H Latency: min = 0.0290527 ms, max = 0.0839844 ms, mean = 0.046354 ms, median = 0.039093 ms, percentile(90%) = 0.0678101 ms, percentile(95%) = 0.0727539 ms, percentile(99%) = 0.0808105 ms
[07/08/2024-15:38:07] [I] Total Host Walltime: 3.06595 s
[07/08/2024-15:38:07] [I] Total GPU Compute Time: 3.03714 s
[07/08/2024-15:38:07] [I] Explanations of the performance metrics are printed in the verbose logs.
[07/08/2024-15:38:07] [I] 
&&&& PASSED TensorRT.trtexec [TensorRT v100100] # trtexec --onnx=./bottle/padim_fp32_onnx/weights/onnx/model.onnx --saveEngine=./bottle/padim_fp32_trt.engine

In [ ]:
# Update the TRT metadata with the normalization thresholds produced by the
# ONNX export, so TRT predictions are post-processed identically.
# Context managers replace the original bare `json.load(open(...))` calls,
# which leaked file handles.
with open(f"./{category}/padim_fp32_onnx/weights/onnx/metadata.json") as f:
    metadata_onnx = json.load(f)
with open("metadata_padim.json") as f:
    metadata_padim = json.load(f)
for key in ("image_threshold", "pixel_threshold", "min", "max"):
    metadata_padim[key] = metadata_onnx[key]
with open("metadata_padim.json", 'w') as f:
    json.dump(metadata_padim, f, indent=4)


# Inference with the TensorRT model
inferencer_trt = TrtInferencer(
    path=f"./{category}/padim_fp32_trt.engine",
    metadata="metadata_padim.json",  # plain string (the original f-prefix had no placeholders)
    task=task
)

data = [str(img) for img in inference_path.glob('*.png')]
output_path_trt, inference_time_trt, memory_trt, trt_output_images = test_model(inferencer_trt, data, 'trt')
# memory_trt comes from nvidia-smi in MiB, hence a single /1024 to get GiB
print(f"Inference time: {inference_time_trt:.2f} seconds, Memory: {memory_trt/1024:.2f} GB")
# Display the visualization produced by test_model
Image.open(output_path_trt)
Reading metadata from file metadata_padim.json...
metadata:  {'task': 'segmentation', 'transform': {'__version__': '1.3.1', 'transform': {'__class_fullname__': 'Compose', 'p': 1.0, 'transforms': [{'__class_fullname__': 'Resize', 'always_apply': True, 'p': 1, 'height': 256, 'width': 256, 'interpolation': 1}, {'__class_fullname__': 'Normalize', 'always_apply': False, 'p': 1.0, 'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225], 'max_pixel_value': 255.0}, {'__class_fullname__': 'ToTensorV2', 'always_apply': True, 'p': 1.0, 'transpose_mask': False}], 'bbox_params': None, 'keypoint_params': None, 'additional_targets': {'image': 'image', 'depth_image': 'image'}, 'is_check_shapes': True}}, 'image_threshold': 238.55474853515625, 'pixel_threshold': 160.8461456298828, 'min': 1.631107211112976, 'max': 666.3355712890625}
inference batchsize = 1
Reading engine from file ./bottle/padim_fp32_trt.engine...
warm up finished...
Average pixel-level F1 score: 0.7166882753372192
Average image-level F1 score: 0.9381791353225708
Average pixel-level AUROC: 0.9272202849388123
Average image-level AUROC: 0.9510656595230103
Inference time: 0.47 seconds, Memory: 5.46 GB
Out[ ]:
No description has been provided for this image
In [ ]:
# Comparison of the inference time and memory usage
backends = ['FP32', 'FP16', 'TensorRT']
positions = [0, 1, 2]
inference_times = [inference_time_FP32, inference_time_fp16, inference_time_trt]
# NOTE(review): memory_trt is divided by 1024 while the others use 1024**3 —
# presumably memory_trt was measured in MiB (nvidia-smi) and the torch figures
# in bytes; confirm the units are actually consistent.
memory_usage = [memory_FP32 / 1024**3, memory_fp16 / 1024**3, memory_trt / 1024] # Convert to GB

fig, (time_ax, mem_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Same treatment for both panels: bars, value labels, and a linear trend line.
for axis, values, title, ylabel in (
    (time_ax, inference_times, 'Inference Time (GPU)', 'Time (s)'),
    (mem_ax, memory_usage, 'Memory Usage (VRAM)', 'Memory (GB)'),
):
    axis.bar(backends, values)
    axis.set_title(title)
    axis.set_ylabel(ylabel)
    for idx, value in enumerate(values):
        axis.text(idx, value, str(round(value, 2)), ha='center', va='bottom')
    trend = np.poly1d(np.polyfit(positions, values, 1))
    axis.plot(positions, trend(positions), "r--")

plt.tight_layout()
output = Path(f"./{category}/inference_comparison.png")
plt.savefig(output)
Image.open(output)
Out[ ]:
No description has been provided for this image
In [ ]:
# Percentage improvement in inference time and memory usage with TensorRT
# (relative to the FP32 torch baseline).
inference_time_improvement = (inference_time_FP32 - inference_time_trt) / inference_time_FP32 * 100
memory_improvement = (memory_usage[0] - memory_usage[2]) / memory_usage[0] * 100
for label, pct in (("Inference time", inference_time_improvement),
                   ("Memory usage", memory_improvement)):
    print(f"{label} improvement with TensorRT: {pct:.2f}%")
Inference time improvement with TensorRT: 85.62%
Memory usage improvement with TensorRT: -13.02%
In [ ]:
# Comparison of the inference time and memory usage using a line plot
fig, time_axis = plt.subplots(1, 1, figsize=(10, 5))
gpu_backends = ['FP32', 'FP16', 'TensorRT']
time_axis.plot(gpu_backends, inference_times, label='Inference Time (GPU)', marker='o')
time_axis.set_ylabel('Time (s)')
time_axis.set_xlabel('Inference Type')
# Second y-axis sharing the same x positions for the memory curve.
mem_axis = time_axis.twinx()
mem_axis.plot(gpu_backends, memory_usage, label='Memory Usage (VRAM)', marker='o', color='red')
mem_axis.set_ylabel('Memory (GB)')
plt.tight_layout()
fig.legend(loc=(0.7, 0.85))
output = Path(f"./{category}/inference_comparison_line.png")
plt.savefig(output)
Image.open(output)
Out[ ]:
No description has been provided for this image
In [ ]:
# Comparison between ONNX and OpenVINO
cpu_backends = ['ONNX', 'OpenVINO']
inference_times_2 = [inference_time_onnx, inference_time_openvino]
memory_usage_2 = [memory_onnx / 1024**3, memory_openvino / 1024**3] # Convert to GB

fig, (time_ax, mem_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Bars, value labels, and a linear trend line — identical treatment per panel.
for axis, values, title, ylabel in (
    (time_ax, inference_times_2, 'Inference Time (CPU)', 'Time (s)'),
    (mem_ax, memory_usage_2, 'Memory Usage (RAM)', 'Memory (GB)'),
):
    axis.bar(cpu_backends, values)
    axis.set_title(title)
    axis.set_ylabel(ylabel)
    for idx, value in enumerate(values):
        axis.text(idx, value, str(round(value, 2)), ha='center', va='bottom')
    trend = np.poly1d(np.polyfit([0, 1], values, 1))
    axis.plot([0, 1], trend([0, 1]), "r--")

plt.tight_layout()
output = Path(f"./{category}/onnx_openvino_comparison.png")
plt.savefig(output)
Image.open(output)
Out[ ]:
No description has been provided for this image
In [ ]:
# Comparison of the inference time and memory usage using a line plot
fig, time_axis = plt.subplots(1, 1, figsize=(10, 5))
cpu_backends = ['ONNX', 'OpenVINO']
time_axis.plot(cpu_backends, inference_times_2, label='Inference Time (CPU)', marker='o')
time_axis.set_ylabel('Time (s)')
time_axis.set_xlabel('Inference Type')
# Tighten the y-range so the two close values are visually separable.
time_axis.set_ylim(min(inference_times_2) - 0.05, max(inference_times_2) + 0.05)
mem_axis = time_axis.twinx()
mem_axis.plot(cpu_backends, memory_usage_2, label='Memory Usage (RAM)', marker='o', color='red')
mem_axis.set_ylabel('Memory (GB)')
mem_axis.set_ylim(min(memory_usage_2) - 1, max(memory_usage_2) + 1)
plt.tight_layout()
fig.legend(loc=(0.7, 0.85))
output = Path(f"./{category}/onnx_openvino_comparison_line.png")
plt.savefig(output)
Image.open(output)
Out[ ]:
No description has been provided for this image
In [ ]:
def dice_coefficient(gt_img, pred_img):
    binary_gt = np.where(gt_img > 0, 1, 0).astype(np.uint8)
    binary_pred = np.where(pred_img > 0, 1, 0).astype(np.uint8)
    intersection = np.logical_and(binary_gt, binary_pred)
    dice = (2. * intersection.sum()) / (binary_gt.sum() + binary_pred.sum())
    return dice

def auroc_pixel_level(gt_img, pred_img):
    """Pixel-level AUROC between two masks, both binarized at threshold 0.

    NOTE(review): the prediction is binarized before scoring, so AUROC sees
    only 0/1 "scores" — degenerate as a ROC measure; presumably intended as a
    rough agreement score between inference backends. Confirm.

    Returns:
        float AUROC over the flattened pixels.
    """
    metric = AUROC()
    pred_binary = (torch.from_numpy(pred_img) > 0).long()
    gt_binary = (torch.from_numpy(gt_img) > 0).long()
    # AUROC expects float scores and integer targets, both 1-D.
    score = metric(pred_binary.reshape(-1).float(), gt_binary.reshape(-1))
    return score.item()

# Comparison of the segmentation results of all the images for each inference type
inferences = ['FP32', 'FP16', 'ONNX', 'OpenVINO', 'TensorRT']
per_backend = [fp32_output_images, fp_16_output_images, onnx_output_images, openvino_output_images, trt_output_images]
# Interleave so that row r holds image r rendered by every backend,
# with the FP32 result first (it serves as the reference in column 0).
segmentation_results = [
    per_backend[col][row][1].segmentations
    for row in range(len(per_backend[0]))
    for col in range(len(per_backend))
]

n_rows, n_cols = 5, 5
fig, ax = plt.subplots(n_rows, n_cols, figsize=(20, 20))

# Plot the images row by row. The original advanced the row offset with
# k = (j + 1) * (i + 1), which depends on the inner loop variable leaking out
# and is only coincidentally correct for a 5-wide grid; compute it explicitly.
for i in range(n_rows):
    base = i * n_cols  # flat index of this row's FP32 reference image
    for j in range(n_cols):
        ax[i, j].imshow(segmentation_results[base + j])
        ax[i, j].axis('off')
        ax[i, j].set_title(inferences[j])
        if j != 0:
            # Agreement of each backend with the FP32 reference in column 0.
            dice = dice_coefficient(segmentation_results[base], segmentation_results[base + j])
            auroc = auroc_pixel_level(segmentation_results[base], segmentation_results[base + j])
            ax[i, j].set_title(f'{inferences[j]}\nDice: {dice:.2f}\nAUROC: {auroc:.2f}')
fig.tight_layout()
output_path = Path(f"./{category}/segmentation_comparison.png")
plt.savefig(output_path)
Image.open(output_path)
Out[ ]:
No description has been provided for this image
In [ ]: